/*=======================================================================
Creator: Jingyuan Wang, jingyuanwang@u.northwestern.edu
Date created: 				07/17/2018
Date last modified: 		
Purpose: 

1. Import all the ASM energy-related dataset
2. Merge/append/clean and get the energy-related variables
	
There are 3 kinds of data set contains energy related variables:
	1. __GS101 : 
			Annual Survey of Manufactures: General Statistics: Statistics for Industry Groups and Industries
			available from 2004-2016, each year has different variables
			Only files with year == 2013-2016, 2008-2011, 2006 have energy related variabes
	2. __GS201 : 
			Annual Survey of Manufactures: General Statistics: Statistics for Industry Groups and Industries: 2011 REFRESH
			available for 2011
	3. __GS104 : 
			Annual Survey of Manufactures: General Statistics: Purchased Fuels and Electric Energy Used for Heat and Power by Industry Groups and Industries
			available for 2004 and 2005
	4. __GS108 : 
			Annual Survey of Manufactures: General Statistics: Cost of Materials
			available for 2005

Rules of chosing data from different version:
	1. __GS101 typically include 2 years, the current year and an updated version of the previous year.
				I will use the lasted __GS101 version
	2. __GS201 
				I will use 2011 data from GS201 instead of GS101 
				However, I am still using the 2010 data from file 2011_GS101.
	3. __GS104 
				I will use 2005 data because this version include 2002-2005 data
	4. __GS108: duplicated information. Already included in GS104.

	
* This file collect energy related variables from 2002-2016.
* shipment variables and other control variables (like employment/wage/...) are only collected from 2005-2016.
* Because the data quality before 2005 is bad, and NBER data are available from 1990-2011.
==========================================================================*/


*************************************************************************
* 				PART A. Import data	end with GS101  					*
*************************************************************************

tempfile ASM
*********************************
* 2016
*********************************
local year = 2016
import delimited "$ASMraw/ASM_`year'_31GS101/ASM_`year'_31GS101_with_ann.csv", varn(1) clear
foreach var of varlist geoid-pchoexp_s {
	disp("`var'")
	local lab = `var'[1]
	label var `var' "`lab'"
}
*
drop if _n == 1

keep geoid geodisplaylabel naicsid naicsdisplaylabel yearid ///
	cstfu cstfu_s cstelec cstelec_s elecpch elecpch_s elecgen elecgen_s elecsld elecsld_s ///
	rcptot rcptot_s ///
	emp payann empavpw hours payanpw cstmtot valadd cextot invtote

gen version = "`year'"
save `ASM.dta', replace

*********************************
* 2015 - 2012
*********************************
forvalues year = 2013(1)2015 {
	import delimited "$ASMraw/ASM_`year'_31GS101/ASM_`year'_31GS101_with_ann.csv", varn(1) clear
	foreach var of varlist geoid-pchoexp_s {
		disp("`var'")
		local lab = `var'[1]
		label var `var' "`lab'"
	}
	*
	drop if _n == 1

	keep geoid geodisplaylabel naicsid naicsdisplaylabel yearid ///
		cstfu cstfu_s cstelec cstelec_s elecpch elecpch_s elecgen elecgen_s elecsld elecsld_s ///
	    rcptot rcptot_s ///
		emp payann empavpw hours payanpw cstmtot valadd cextot invtote
	
	gen version = "`year'"

	append using `ASM.dta'
	save `ASM.dta', replace
}
* 
*********************************
* 2011-2008
*********************************
forvalue year = 2008(1)2011 {
	import delimited "$ASMraw/ASM_`year'_31GS101/ASM_`year'_31GS101_with_ann.csv", varn(1) clear
	foreach var of varlist geoid-pchoexp {
		disp("`var'")
		local lab = `var'[1]
		label var `var' "`lab'"
	}
	*
	drop if _n == 1

	keep geoid geodisplaylabel naicsid naicsdisplaylabel yearid ///
		cstfu cstfu_s cstelec cstelec_s elecpch  elecgen  elecsld  ///
		rcptot rcptot_s ///
		emp payann empavpw hours payanpw cstmtot valadd cextot invtote
		
	gen version = "`year'"

	append using `ASM.dta'
	save `ASM.dta', replace
}
*

*********************************
* 2006
*********************************
local year = 2006
import delimited "$ASMraw/ASM_`year'_31GS101/ASM_`year'_31GS101_with_ann.csv", varn(1) clear
foreach var of varlist geoid-cexmcho {
	disp("`var'")
	local lab = `var'[1]
	label var `var' "`lab'"
}
*
drop if _n == 1

keep geoid geodisplaylabel naicsid naicsdisplaylabel yearid ///
	cstfu cstfu_s fuelect cstelec cstelec_s elecpch  elecgen  elecsld   ///
	rcptot rcptot_s ///
	empsmao payann empavpw hrstotm payanpw cstmtot valaddm cextot invtote
	
	rename empsmao emp
	rename hrstotm hours
	rename valaddm valadd
	
gen version = "`year'"

append using `ASM.dta'
save `ASM.dta', replace


*************************************************************************
* 				PART B. Import data	end with GS201  					*
*************************************************************************

* 2011
local year = 2011
import delimited "$ASMraw/ASM_`year'_31GS201/ASM_`year'_31GS201_with_ann.csv", varn(1) clear
foreach var of varlist geoid-pchoexp {
	disp("`var'")
	local lab = `var'[1]
	label var `var' "`lab'"
}
*
drop if _n == 1

keep geoid geodisplaylabel naicsid naicsdisplaylabel yearid ///
	cstfu cstfu_s elecpch cstelec cstelec_s elecgen elecsld ///
	rcptot rcptot_s ///
	emp payann empavpw hours payanpw cstmtot valadd cextot invtote

gen version = "`year'REFRESH"
append using `ASM.dta'
save `ASM.dta', replace


*************************************************************************
* 				PART C. Import data	end with GS104  					*
*************************************************************************
forvalues year = 2004(1)2005 {
	import delimited "$ASMraw/ASM_`year'_31GS104/ASM_`year'_31GS104_with_ann.csv", varn(1) clear
	foreach var of varlist geoid-cstfu_s {
		disp("`var'")
		local lab = `var'[1]
		label var `var' "`lab'"
	}
	*
	drop if _n == 1

	keep geoid geodisplaylabel naicsid naicsdisplaylabel yearid ///
		cstfu cstfu_s fuelect cstelec cstelec_s elecpch  elecgen  elecsld 

	gen version = "`year' Heat and Power"

	append using `ASM.dta'
	save `ASM.dta', replace
}
*



*************************************************************************
* 				PART D. Import data	end with GS108  					*
*************************************************************************
local year = 2005
import delimited "$ASMraw/ASM_`year'_31GS108/ASM_`year'_31GS108_with_ann.csv", varn(1) clear
foreach var of varlist geoid-cstmtot_s {
	disp("`var'")
	local lab = `var'[1]
	label var `var' "`lab'"
}
*
drop if _n == 1

keep geoid geodisplaylabel naicsid naicsdisplaylabel yearid ///
		cstfu cstelec

gen version = "`year' Materials"

append using `ASM.dta'
save `ASM.dta', replace


*************************************************************************
* 				PART E. Drop duplicated versions		  				*
* most of the annual survey include data of 2 years						*
* I will keep the most recent version 									*
*************************************************************************

* 1. drop duplicated data 
drop if version == "2004 Heat and Power" // because "2005 " includes all the data in the 2004 file 
drop if version == "2005 Materials" // because it includes repeated and identical information with 2005 Heat Power
drop fuelect  // because this variable = cstfu + cstelec, also this is only inclulded in the 2005 and 2006 version


* 2. keep only the most up-to-date version
drop if version == "2005 Heat and Power" & yearid == "2005"
drop if version == "2008" & yearid == "2008"
drop if version == "2009" & yearid == "2009"
drop if version == "2010" & yearid == "2010"
drop if version == "2011" & yearid == "2011"
drop if version == "2013" & yearid == "2013"
drop if version == "2014" & yearid == "2014"
drop if version == "2015" & yearid == "2015"



*************************************************************************
* 				PART F. Clean Data					  					*
* vairable and variable labels from different version of the data		*
*************************************************************************


*********************************
* Clean variables
*********************************
drop geoid geodisplaylabel

* 1. destring and rename
destring yearid, gen(year)
drop yearid
destring cstfu, 	gen(cst_fuel) force
destring cstelec, 	gen(cst_elec) force
destring elecpch, 	gen(q_elec_pch) force
destring elecgen, 	gen(q_elec_gen) force
destring elecsld,	gen(q_elec_sld) force
destring rcptot,	gen(shipment) force
destring emp, 		replace force
destring payann,	gen(pay) force
destring empavpw,	gen(prode) force
destring hours,		gen(prodh) force
destring payanpw, 	gen(prodw) force
destring cstmtot,	gen(matcost) force
destring valadd,	gen(vadd) force
destring invtote,	gen(invent) force 
destring cextot,	gen(invest) force

keep naicsid naicsdisplaylabel year ///
		shipment cst_fuel cst_elec q_elec_pch   ///
		 q_elec_gen  q_elec_sld   ///
		 emp pay prode prodh prodw matcost vadd invest invent
		 
order naicsid naicsdisplaylabel year ///
		shipment cst_fuel cst_elec q_elec_pch   ///
		 q_elec_gen  q_elec_sld   ///
		 emp pay prode prodh prodw matcost vadd invest invent

* 2. separate aggregated sectors
replace naicsid = "99" if naicsid == "31-33"
gen level = strlen(naicsid)
lab def level 2 "Total" 3 "3-digit sector" 4 "4-digit industry" 5 "5-digit industry" 6 "6-digit industry"
lab value level level

* 3. destring NAICS code, note that *****N means others, *****M means comprised of two or more NAICS industry codes
rename naicsid naicsid_str
label var naicsid_str "naics code from the survey. (the code is the current survey year naics code)"

destring naicsid_str, gen(naicsid) force
gen naicsid_5digit = substr(naicsid_str, 1, 5)

gen code = substr(naicsid_str, 6, 1)
replace code = "6-digit" if code != "M" & code != "N" & code != "P" & level == 6
replace code = "5-digit" if code != "M" & code != "N" & code != "P" & level == 5
replace code = "4-digit" if code != "M" & code != "N" & code != "P" & level == 4
replace code = "3-digit" if code != "M" & code != "N" & code != "P" & level == 3
replace code = "total"   if code != "M" & code != "N" & code != "P" & level == 2
order code, a(level)


*************************************************************************
* 				PART G. SAVE						  					*
*************************************************************************
order naicsid naicsid_str naicsid_5digit code naicsdisplaylabel level

label data "raw ASM data with simple interpolation"

save "$ASM/ASM.dta", replace
